\documentclass[10pt]{article}
\usepackage{color}
\definecolor{gray}{rgb}{0.7,0.7,0.7}
\usepackage{framed}
\usepackage{enumitem}
\usepackage{longtable}
\usepackage[pdfborder={0 0 0},hyperfootnotes=false]{hyperref}

\addtolength{\textwidth}{3.4cm}
\addtolength{\hoffset}{-1.7cm}
\addtolength{\textheight}{4cm}
\addtolength{\voffset}{-2cm}


\begin{document}

\title{ANGSD formats}
\author{tsk}
\maketitle
\vspace*{1em}


\section{SAF formats}
SAF files are files that contain sample allele frequencies. These are generated with -doSaf in main ANGSD. They contain either the log-likelihood ratio to the most likely category or the posterior probability (pp), depending on whether -prior has been supplied.
The first 8 bytes magic number determines which SAF version. If no magic number is present then version0 is assumed.
\subsection{version 0}
The first version of the SAF files was simply a flat binary file of doubles, \texttt{PREFIX.saf}, along with an associated \texttt{PREFIX.saf.pos.gz} which contains the gzip compressed `chromosome position'. Assuming \emph{nChr} number of chromosomes, we have \emph{nChr+1} categories for each site. The number of sites can therefore be deduced either directly from the number of lines in the uncompressed output of the \texttt{PREFIX.saf.pos.gz}, or by using the filesize (\emph{fsize}) of the \texttt{PREFIX.saf}: \[\frac{\mathit{fsize}}{\mathrm{sizeof(double)}\cdot(\mathit{nChr}+1)}.\]
\subsection{version 1}

The second iteration of the SAF files contains two raw files and an index file. The first 8 bytes in all three files are an 8-byte magic number, \emph{char[8] ``safv3''}.
\begin{itemize}
\item[PREFIX.saf.gz] bgzf compressed flat floats. With similar interpretation as version0. Each element is a cdatatype 'float' which is 4 bytes.
\item[PREFIX.saf.pos.gz] bgzf compressed flat integer. Representing the position. Each element is a cdatatype 'int' which is 4bytes
\item[PREFIX.saf.idx] uncompressed binary file containing blocks of data described in Table~\ref{tab1}. This is preceded by a size\_t value which indicates the number of categories in the spectrum.
\end{itemize}


\begin{table}
\begin{tabular}{rllll}
  \hline
  {\bf Col} & {\bf Field} & {\bf Type} & {\bf Brief description} \\
  \hline
  1 & {\sf CLEN} & size\_t &  Length of CHR (not including terminating null)\\
  2 & {\sf CHR} & char* & Reference sequence name. Length is CLEN\\
  3 & {\sf NSITES} & size\_t & Number of sites with coverage from reference CHR\\
  4 & {\sf OFF1} & long int & CHR offset into the PREFIX.saf.pos.gz \\
  5 & {\sf OFF2} & long int & CHR offset into the PREFIX.saf.gz \\
  \hline
\end{tabular}
\caption{Content of entry for a single reference name in the PREFIX.saf.idx file.}
\label{tab1}
\end{table}
 Note that it is not possible to deduce the number of sites directly
 from the compressed files.
\clearpage
\subsection{Version 2}
This section describes the banded representation of the sample allele frequency likelihoods. The first 8 bytes in all three files are an 8-byte magic number, \emph{char[8] ``safv4''}.
\begin{itemize}
\item[PREFIX.saf.gz] bgzf compressed. Full description in Table \ref{tab3}.
\item[PREFIX.saf.pos.gz] bgzf compressed flat integer. Representing   the position. Each element is a cdatatype 'int' which is  4bytes. Similar to version 1.
\item[PREFIX.saf.idx] uncompressed binary file containing blocks of data described in Table~\ref{tab4}.
\end{itemize}

\begin{table}[h]
\begin{tabular}{rllll}
  \hline
  {\bf Col} & {\bf Field} & {\bf Type} & {\bf Brief description} \\
  \hline
  1 & {\sf CLEN} & size\_t &  Length of CHR (not including terminating null)\\
  2 & {\sf CHR} & char* & Reference sequence name. Length is CLEN\\
  3 & {\sf NSITES} & size\_t & Number of sites with coverage from reference CHR\\
  4 & {\sf SUMBAND} & size\_t & Sum of bins from reference CHR\\
  5 & {\sf OFF1} & long int & CHR offset into the PREFIX.saf.pos.gz \\
  6 & {\sf OFF2} & long int & CHR offset into the PREFIX.saf.gz \\
  \hline
\end{tabular}
\caption{Content of entry for a single reference name in the PREFIX.saf.idx file.}
\label{tab4}
\end{table}

\begin{table}[h]
\begin{tabular}{rllll}
  \hline
  {\bf Col} & {\bf Field} & {\bf Type} & {\bf Brief description} \\
  \hline
  1 & {\sf FIRST} & int32\_t  &  First category with data\\
  2 & {\sf NCATS} & int32\_t & Number of categories with sample allele frequencies\\
  3 & {\sf SAFLH} & float4\_t[NCATS] & The actual sample allele frequencies\\
  \hline
\end{tabular}
\caption{Record for the sample allele frequencies for a single site. Notice that the SAFLH are loglikelihood ratios to the most likely. Scaling is natural log.}
\label{tab3}
\end{table}

\clearpage
\section{fst formats}
This section describes the binary output generated by a \textbf{realSFS fst index pop1.saf.idx pop2.saf.idx -sfs prior}
\subsection{fstv1}

First iteration of the fst files contains two files. 1) PREFIX.fst.idx 2) PREFIX.fst.gz.
First 8bytes is a magic number determining which binary version.


\begin{itemize}
\item[PREFIX.fst.idx] flat file, described in table \ref{tab2}
\item[PREFIX.fst.gz] bgzf compressed binary file.
\end{itemize}
\subsubsection{PREFIX.fst.idx}
The fst.idx has a very simple header 8bytes magicheader followed by a \texttt{size\_t} containing the number of samples for which we have generated fst results.
\begin{table}[h]
\begin{tabular}{rllll}
  \hline
  {\bf Col} & {\bf Field} & {\bf Type} & {\bf Brief description} \\
  \hline
  1 & {\sf CLEN} & size\_t &  Length of CHR (not including terminating null)\\
  2 & {\sf CHR} & char* & Reference sequence name. Length is CLEN\\
  3 & {\sf NSITES} & size\_t & Number of sites with coverage from reference CHR\\
  4 & {\sf OFF1} & long int & CHR offset into the PREFIX.fst.gz \\
  \hline
\end{tabular}
\caption{Content of the PREFIX.fst.idx}
\label{tab2}
\end{table}
\subsubsection{PREFIX.fst.gz}
\begin{table}[h]
\begin{tabular}{rllll}
  \hline
  {\bf Col} & {\bf Field} & {\bf Type} & {\bf Brief description} \\
  \hline
  1 & {\sf POSI} & int &  Position of the site on the reference sequence\\
  2 & {\sf acoef1} & double* & $\alpha$ coefficients from either the Reynolds estimator or Bhatia \\
  3 & {\sf bcoef2} & double* & $\beta$ coefficients from either the Reynolds estimator or Bhatia\\
  \hline
\end{tabular}
\caption{Contents of the PREFIX.fst.gz file}
\label{tabfstgz}
\end{table}

\newpage
\section{theta formats}
From 0.917 onwards, the -doThetas in angsd won't generate the old ASCII files but rather the indexed file as described below. The original format will not be described in this document.


\subsection{thetav2}
The second iteration of the theta files contains one raw bgzf compressed data file and an uncompressed index file. The first 8 bytes in the (uncompressed) files are an 8-byte magic number, \emph{char[8] ``thetav2''}. These are generated if the options -doSaf 1 and -doThetas 1 have been selected. This will output the following two files:

\begin{itemize}
\item[prefix.thetas.idx] Small uncompressed binary file that contains chr, number of sites, number of chromosomes and the offset into the main data file containing the theta estimates. See below.
\item[prefix.thetas.gz] Main file. Also contains chr, number of sites and number of chromosomes.
\end{itemize}

\subsubsection{Theta definitions}
Let $\eta$ be the site frequency spectrum. Then $\eta_i$ is the posterior probability of being in frequency $i$.
\begin{itemize}
\item[Watterson] $a^{-1}\sum_{i=1}^{n-1}\eta_i,\ a=\sum_{i=1}^{n-1}i^{-1}$
\item[$\pi$] $ {{n}\choose{2}}^{-1}\sum_{i=1}^{n-1}i(n-i)\eta_i$
\item[FuLi] $\eta_1$
\item[FayH] $ {{n}\choose{2}}^{-1}\sum_{i=1}^{n-1}i^2\eta_i$ 
\item[L] $ (n-1)^{-1}\sum_{i=1}^{n-1}i\eta_i$
\end{itemize}

\subsubsection{Description of binary files}


\begin{table}[ht]
\begin{tabular}{rllll}
  \hline
  {\bf Col} & {\bf Field} & {\bf Type} & {\bf Brief description} \\
  \hline
  1 & {\sf CLEN} & size\_t &  Length of CHR (inferred by strlen)\\
  2 & {\sf CHR} & char* & Reference sequence name. Length is CLEN\\
  3 & {\sf NSITES} & size\_t & Number of sites with coverage from reference CHR\\
  4 & {\sf NCHR} & size\_t & number of possible derived/minor alleles. (2*nInd for the unfolded, nInd for the folded)\\
  5 & {\sf OFF} & long int & CHR offset into the thetas.gz \\
  \hline
\end{tabular}
\caption{Content of entry for a single reference name in the PREFIX.thetas.idx file. Note that there exists a 8byte magicnumber in the beginning of the file.}
\label{thetaidx}
\end{table}

\begin{table}[h]
\begin{tabular}{rllll}
  \hline
  {\bf Col} & {\bf Field} & {\bf Type} & {\bf Brief description} \\
  \hline
  1 & {\sf CLEN} & size\_t &  Length of CHR (inferred by strlen)\\
  2 & {\sf CHR} & char* & Reference sequence name. Length is CLEN\\
  3 & {\sf NSITES} & size\_t & Number of sites with coverage from reference CHR\\
  4 & {\sf NCHR} & size\_t & number of possible derived/minor alleles. (2*nInd for the unfolded, nInd for the folded)\\
  5 & {\sf POSI} & int[NSITES] & zero indexed positions for CHR \\
  6 & {\sf Watterson} & float[NSITES] & logscaled persite estimates of Watterson theta estimator (number of segregating sites) \\
  7 & {\sf $\pi$} & float[NSITES] & logscaled persite estimates of the Tajima theta estimator (pairwise differences) \\
  8 & {\sf FuLi} & float[NSITES] & logscaled persite estimates of the FuLi theta estimator (singleton category) \\
  9 & {\sf FayH} & float[NSITES] & logscaled persite estimates of the FayH theta estimator\\
  10 & {\sf L} & float[NSITES] & logscaled persite estimates of the L theta estimator  \\
  \hline
\end{tabular}
\caption{Content of the PREFIX.thetas.gz file. Note that there exists a 8byte magicnumber in the beginning of the file.}
\label{thetagz}
\end{table}

\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
